
***************************************************************
** CLUSTERING SEQUENCES FROM OPTIMAL MATCHING ON TRANSITIONS **
***************************************	***********************	
* Labussiere, Levels and Vink (2021)


* This script needs the following inputs:
	* dissimilarity_matrix_OMTR.csv (From OMTR_dissimilarity_matrix.R)
	* individual_transitions (From OMTR_dissimilarity_matrix.R)
	* individual_states_long: data frame of individual sequences of states in long format (not shareable)

* This script produces the following outputs:
	* FIGURES S2 S3 S4
	* DATASET OMTR_clusters (to be used in OMTR_indexplots.R)
	
* Import R dissimilarity matrix	(From OMTR_dissimilarity_matrix.R)
clear all
set maxvar 14000
import delimited dissimilarity_matrix_OMTR.csv, delimiter(comma) 
drop v1 /*remove column for personal identifier*/
save  Sub_datasets\dissimilarity_matrix_OMTR.dta, replace
use Sub_datasets\dissimilarity_matrix_OMTR.dta, clear

mata: SQdist = st_data(.,.) 

mata: st_matrix("SQdist_TR", SQdist)


**************
* CLUSTERING * Comparison Ward, PAM Ward, PAM ga algorithms
**************

clear
	
	* 1/ Ward algorithm
	clustermat ward SQdist_TR, name(ward) add
	clustermat generate ward_R=groups(2/12), name(ward)

	sort ward_id
	
	* Silhouette width Ward
	set dp comma
	foreach nb in 2 3 4 5 6 7 8 9 10 {
	silhouette ward_R`nb', distmat(SQdist_TR) idvar(ward_id) silh(silh_ward_`nb'_c) ///
	title("Silhouette  plot - `nb'-cluster solution (Ward)") graphregion(color(white))
	}
	
	* Extraction medoids Ward
	foreach nb in 2 3 4 5 6 7 8 9 10 {
	getmedoids ward_R`nb', distmat(SQdist_TR) idvar(ward_id) gen(medoid_ward`nb')
	}
	
	* 2/ Partition aroud Medoids (PAM) with Ward medoids
	foreach nb in 2 3 4 5 6 7 8 9 10 {
	clpam pam_ward_R`nb', distmat(SQdist_TR) idvar(ward_id) medoids(medoid_ward`nb')
	}
	
	* Silhouette width PAM with Ward
	set dp comma
	foreach nb in 2 3 4 5 6 7 8 9 10 {
	silhouette pam_ward_R`nb', distmat(SQdist_TR) idvar(ward_id) silh(silh_PAM_ward_R`nb') ///
	title("Silhouette  plot `nb'-cluster solution (PAM Ward)") graphregion(color(white))
	}
	
		
	* 3/ Partition aroud Medoids with genetic algorithm 
	foreach nb in 2 3 4 5 6 7 8 9 10 {
	clpam pam_ga_R`nb', distmat(SQdist_TR) idvar(ward_id) medoids(`nb') ga
	}
	
	* Silhouette width PAM genetic algorithm 
	foreach nb in 2 3 4 5 6 7 8 9 10 {
	silhouette pam_ga_R`nb', distmat(SQdist_TR) idvar(ward_id) silh(silh_PAM_ga_R`nb') ///
	title("Silhouette  plot `nb'-cluster solution (PAM GA)") graphregion(color(white))
	graph export pam_ga\PAM_ga_R`nb'_silh.png, replace
	}
		
		
	* Get medoids for the chosen cluster partition
	* PAM-Ward 6-cluster solution
	getmedoids pam_ward_R6, distmat(SQdist_TR) idvar(ward_id) gen(medoid_pam_ward6)
	
	
	******************************************************************************
	* FIGURE S4
	* Average silhouette width for the six-cluster solution of the PAM Ward
	******************************************************************************
	drop silh_PAM_ward_R6
	silhouette pam_ward_R6, distmat(SQdist_TR) idvar(ward_id) silh(silh_PAM_ward_R6) ///
	title("") graphregion(color(white)) ///
	lcolor("27 158 119" "221 170 51" "187 85 102" "0 68 136" "0 0 0") 
	legend(order (1 	"Downward movers" ///
		   2	"School deregistered" ///
		   3	"Successful excursionists" ///
		   4	"Early school leavers" ///
		   5	"Upward movers" ///
		   6	"Standard movers"))


rename ward_id id
save Sub_datasets\cluster_results.dta, replace

******************************************************************************
* FIGURES S2 and S3 
* Raw data used for the figures
* Comparison average silhouette width between the three algorithm (S2)
* Average silhouette width chose algorithm (S3)
******************************************************************************
set dp comma
sum  silh_PAM_ward_R2 silh_PAM_ward_R3 silh_PAM_ward_R4 /// Figure S3
	 silh_PAM_ward_R5 silh_PAM_ward_R6 silh_PAM_ward_R7 silh_PAM_ward_R8 ///
	 silh_PAM_ward_R9 silh_PAM_ward_R10
sum  silh_ward_2_c silh_ward_3_c silh_ward_4_c silh_ward_5_c ///
	 silh_ward_6_c silh_ward_7_c silh_ward_8_c silh_ward_9_c 
sum  silh_PAM_ga_R2 silh_PAM_ga_R3 silh_PAM_ga_R4 silh_PAM_ga_R5 ///
     silh_PAM_ga_R6 silh_PAM_ga_R7 silh_PAM_ga_R8 silh_PAM_ga_R9 ///
     silh_PAM_ga_R10


*********************************************************************
* MERGE CLUSTER MEMBERSHIP WITH SEQUENCES OF STATES AND TRANSITIONS * 
*********************************************************************
* This is to produce the index plots based on transition and state 
* elements in R

clear all

* Import R matrix of individual sequences of transitions (From OMTR_dissimilarity_matrix.R)
set maxvar 14000
import delimited individual_transitions.csv.csv, delimiter(comma)

	rename v1 id 
	rename v2 T1
	rename v3 T2
	rename v4 T3
	rename v5 T4
	rename v6 T5
	rename v7 T6
	rename v8 T7
	rename v9 T8
	rename v10 T9

label define transitions ///
1	"Non-existing"       ///
2	"Standard"	         ///
3	"Downward"  	     ///
4	"Upward"	         ///
5	"Interrupted"        ///
6	"Dropout" 		     ///	
7	"Start VMBO"	     ///
8   "Start Bridge"       ///
9   "Start HAVO/VWO"     ///
					, replace
label values T1-T9 transitions

foreach nb in 1 2 3 4 5 6 7 8 9 {
label variable T`nb' "Transition `nb'"
}

reshape long T, i(id) j(time)
rename T transition

* Merge with the cluster results
************************************
merge m:1 id  using Sub_datasets\cluster_results.dta
drop _merge
sort id time
sqset transition id time, trim

* Merge with individual sequences of states
**************************************************
gen year = 2007 + time
sort id year
merge m:m id year using individual_states_long.dta
keep if _merge == 3
drop _merge
sort rinpersoon year

* Export the dataset of results to produce the index plots with R
keep rinpersoon year transition track_14_2 pam_ward_* 
rename track_14_2 track
reshape wide transition track_repeat track, i(rinpersoon) j(year)
saveold OMTR_clusters.dta, replace version(12)

